Four Maps of Harrisonburg¶

In [ ]:
import ipyparallel as ipp

cluster = ipp.Cluster.from_file("C:\Users\joost\.ipython\profile_default\security\cluster-.json")
rc = cluster.connect_client_sync()
rc

The following code creates four different maps of Harrisonburg.

In [45]:
import os
import zipfile
import pandas as pd
import geopandas as gpd
In [46]:
# Set directory path
data_dir = 'data'

# Check for .zip files in the directory
zip_files = [f for f in os.listdir(data_dir) if f.endswith('.zip')]

# Unzip all found .zip files
for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(data_dir, zip_file), 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# Now, look for .kml files in the directory after unzipping
kml_files = [f for f in os.listdir(data_dir) if f.endswith('.kml')]
In [47]:
# Initialize an empty list to store data
data = []

# Iterate over .kml files, reading each into GeoDataFrame and extracting necessary information
for kml_file in kml_files:
    kml_path = os.path.join(data_dir, kml_file)
    try:
        # Read the KML file with geopandas
        gdf = gpd.read_file(kml_path)
        
        # Append each row of the GeoDataFrame with the file name to our data list
        for _, row in gdf.iterrows():
            data.append({'file_name': kml_file, 'geometry': row['geometry']})
    except Exception as e:
        print(f"Could not read {kml_file}: {e}")

# Convert the data list into a DataFrame
df_kml = pd.DataFrame(data)
In [48]:
import re
df_kml['file_id'] = df_kml['file_name'].apply(lambda x: re.search(r'_(\d+)_', x).group(1) if re.search(r'_(\d+)_', x) else None)
df_kml = df_kml.drop(columns=['file_name'])
In [49]:
names_df = pd.read_csv("data/names.csv")

names_df['ID'] = names_df['ID'].astype(str)

# Map the 'Name' column in df_kml based on matching 'file_id' with 'ID' in names_df
df_kml = df_kml.merge(names_df, left_on='file_id', right_on='ID', how='left').drop(columns=['ID'])

# Rename 'Student' to 'Name' in df_kml
df_kml = df_kml.rename(columns={'Student': 'Name'})
In [50]:
df_majors = pd.read_csv("data/names_majors.csv")
In [51]:
df_majors['Program and Plan'] = df_majors['Program and Plan'].apply(
    lambda x: re.search(r' -\s*(.*?)\s*-', x).group(1) if isinstance(x, str) and re.search(r' -\s*(.*?)\s*-', x) else None
)
df_majors['Name'] = df_majors['Name'].str.replace(',', ', ', regex=False)
In [52]:
df_majors = df_majors.rename(columns={'Program and Plan': 'Program'})
In [53]:
# Convert the 'ID' in df_majors and 'file_id' in df_kml to strings if they're not already
df_majors['ID'] = df_majors['ID'].astype(str)
df_kml['file_id'] = df_kml['file_id'].astype(str)

# Merge df_kml with df_majors on the ID columns
df_merged = df_kml.merge(df_majors, left_on='Name', right_on='Name', how='left')

# Drop the redundant 'ID' column if you only want to keep 'file_id'
df_merged = df_merged.drop(columns=['ID'])
In [54]:
import pandas as pd
import plotly.express as px
import geopandas as gpd

# Calculate longitude and latitude for each geometry
df_merged['lon'] = df_merged['geometry'].apply(lambda geom: geom.centroid.x if geom else None)
df_merged['lat'] = df_merged['geometry'].apply(lambda geom: geom.centroid.y if geom else None)

Grouped by name¶

In [56]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
    df_merged,
    lat="lat",
    lon="lon",
    color="Name",
    title="Map of Geometries by KML File",
    hover_name="Name",
    mapbox_style="carto-positron",  # Black and white style
    zoom=10  # Adjust the zoom level as needed
)

# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
    mapbox=dict(
        center=dict(lat=38.4496, lon=-78.8689),
        zoom=10  # Higher zoom level for more focus on the center
    ),
    margin={"r":0, "t":0, "l":0, "b":0}
)

# Show the map
fig.show()

Grouped by Program¶

In [58]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
    df_merged,
    lat="lat",
    lon="lon",
    color="Program",
    title="Map of Geometries by KML File",
    hover_name="Name",
    mapbox_style="carto-positron",  # Black and white style
    zoom=10  # Adjust the zoom level as needed
)

# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
    mapbox=dict(
        center=dict(lat=38.4496, lon=-78.8689),
        zoom=10  # Higher zoom level for more focus on the center
    ),
    margin={"r":0, "t":0, "l":0, "b":0}
)

# Show the map
fig.show()

Group by level¶

In [60]:
# Create a Mapbox scatter plot with color coding by Name
fig = px.scatter_mapbox(
    df_merged,
    lat="lat",
    lon="lon",
    color="Level",
    title="Map of Geometries by KML File",
    hover_name="Name",
    mapbox_style="carto-positron",  # Black and white style
    zoom=10  # Adjust the zoom level as needed
)

# Update layout to center on Harrisonburg, VA (coordinates: 38.4496, -78.8689)
fig.update_layout(
    mapbox=dict(
        center=dict(lat=38.4496, lon=-78.8689),
        zoom=10  # Higher zoom level for more focus on the center
    ),
    margin={"r":0, "t":0, "l":0, "b":0}
)

# Show the map
fig.show()
In [61]:
import numpy as np
from scipy.spatial.distance import pdist

# Ensure lat and lon columns exist by calculating the centroids in df_merged if they don't exist
if 'lat' not in df_merged.columns or 'lon' not in df_merged.columns:
    df_merged['lon'] = df_merged['geometry'].apply(lambda geom: geom.centroid.x if geom else None)
    df_merged['lat'] = df_merged['geometry'].apply(lambda geom: geom.centroid.y if geom else None)

# Step 1: Calculate the average distance within each student's points
def average_distance_within_group(group):
    if len(group) < 2:  # If only one point, average distance is zero
        return 0
    coords = group[['lat', 'lon']].values
    return np.mean(pdist(coords))  # Calculate pairwise distances and take the mean

# Calculate avg_distance for each student and store as a DataFrame
student_avg_dist = df_merged.groupby('Name', as_index=False, group_keys=False).apply(
    lambda group: pd.Series({
        'avg_distance': average_distance_within_group(group)
    })
)
C:\Users\joost\AppData\Local\Temp\ipykernel_81680\3016280905.py:18: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

In [63]:
# Ensure 'avg_distance' is numeric
student_avg_dist['avg_distance'] = pd.to_numeric(student_avg_dist['avg_distance'], errors='coerce').fillna(0).astype(float)

# Step 2: Sort by avg_distance and divide students into 4 equal groups
student_avg_dist['group'] = pd.qcut(student_avg_dist['avg_distance'], 4, labels=False)+1
student_avg_dist['group'] = student_avg_dist['group'].astype(str)
In [65]:
# Step 3: Merge the 'group' information back into df_merged for visualization
df_merged['Name'] = df_merged['Name'].astype(str).str.strip()
student_avg_dist['Name'] = student_avg_dist['Name'].astype(str).str.strip()

df_merged = df_merged.merge(student_avg_dist[['Name', 'group']], on='Name', how='left')
In [67]:
# Create a new DataFrame with group names and unique student lists
df_merged['FirstName'] = df_merged['Name'].apply(
    lambda x: f"{x.split(', ')[1]} {x.split(', ')[0][0]}." if ', ' in x else x
)
In [69]:
grouped_names = (
    df_merged.groupby('group')['FirstName']
    .unique()
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

# Create the "Group" label with group number and names
grouped_names['Group'] = grouped_names['group'].apply(lambda x: f"Group {x}")
grouped_names['Group'] = grouped_names['Group'] + ': ' + grouped_names['FirstName']

# Merge the new 'Group' label back into df_merged
df_merged = df_merged.merge(grouped_names[['group', 'Group']], on='group', how='left')

Clustered by distance from center¶

In [99]:
# Plot the map with custom hover text for each group
fig = px.scatter_mapbox(
    df_merged,
    lat="lat",
    lon="lon",
    color="Group",  # Color by group
    title="Student Groups Based on Clustering of Locations",
    hover_name="Name",
    mapbox_style="carto-positron",  # Black and white style
    zoom=10
)

# Set custom hover template for all points to show group name lists
fig.update_traces(hovertemplate="<b>%{custom_hover}</b><extra></extra>")

# Center the map on Harrisonburg, VA (38.4496, -78.8689)
fig.update_layout(
    mapbox=dict(
        center=dict(lat=38.4496, lon=-78.8689),
        zoom=10
    ),
    margin={"r":0, "t":0, "l":0, "b":0}
)

# Show the map
fig.show()
In [ ]: